# merging and cleaning vessel characteristics from various sources
# needs access to scikit tools for iterative interpolation

import numpy as np
import pandas as pd
import os
import fill_vessel_chars

# historic wfr
# scraping PDFs for vessels that were scrapped prior to our access to the WFR
# HAVE TO CLEAN UP by hand class variable for gas chem tankers tankers (remove fields with vessel names)
wfr_hist_file = r"H:\My Drive\Boats\ReplicationCode\data_not_to_share\vessel_characteristics\WFR_historic\%s.csv"
wfr_hist_file_tags = ['FCC','bulkers','tankers','multi','roro','reefer','chemtankers','gas']

# current WFR
wfr_file = r"H:\My Drive\Boats\ReplicationCode\data_not_to_share\vessel_characteristics\world_fleet_registry\%s_09212018.csv"
wfr_file_tags = ['autocarriers', 'bulkers', 'combos', 'container', 'generalcargo', 'lng', 'lpg', 'mpp', 'offshore', 'reefer', 'roro', 'tanker']

# supplemental data from current WFR (ownership, etc)
wfr_supp_folder = r"H:\My Drive\Boats\ReplicationCode\data_not_to_share\vessel_characteristics\world_fleet_registry\wfr_08282019"

# marine traffic data
mt_file = r"H:\My Drive\Boats\ReplicationCode\data_not_to_share\vessel_characteristics\marine_traffic_scraped.csv"

# IMO Auxilliary Engine Loads
imo_aux_file = r"H:\My Drive\Boats\ReplicationCode\data_not_to_share\vessel_characteristics\IMO_aux_loads.xlsx"

# output file 
out_file_new = r"H:\My Drive\Boats\ReplicationCode\data\wfr_wHist_new.csv" 

###############################################################################

#### dealing with supplemental vars in WFR
# loop through each raw file
directory = os.fsencode(wfr_supp_folder)
  
df_wfr_supp_list = []   
for file in os.listdir(directory):
     filename = os.fsdecode(file)
     if filename.endswith(".csv") : 
         print(os.path.join(wfr_supp_folder, filename))
         tmp = pd.read_csv(os.path.join(wfr_supp_folder, filename) , index_col=False , encoding = "ISO-8859-1" , skiprows=[0,1,2,3,4,5]  , engine='python' , sep=',', quotechar='"', error_bad_lines=False )
         
         # drop the footer
         tmp = tmp[~tmp.Name.str.contains("Clarkson Research Services")]
         
         df_wfr_supp_list.append(tmp)
           
df_wfr_supp = pd.concat(df_wfr_supp_list,axis=0,sort=True,ignore_index=True)
del df_wfr_supp_list


# load IMO Aux Engine Loads
df_IMOaux = pd.read_excel( imo_aux_file, index_col=False  )

# load current WFR #####
dfs_wfr = []
for t in wfr_file_tags :
    dfs_wfr.append( pd.read_csv( wfr_file % t , index_col=False , encoding = "ISO-8859-1", low_memory=False ) )
df_wfr = pd.concat(dfs_wfr,axis=0,sort=True)
df_wfr['from_hist'] = 0
#df_wfr = df_wfr.drop(['index'],axis=1)
del dfs_wfr


# load historic WFR ##### 
dfs_hist = [] 
for t in wfr_hist_file_tags :
    df_i = pd.read_csv( wfr_hist_file % t , encoding='utf-8')
    df_i['register'] = t 
    dfs_hist.append( df_i )
    del df_i
df_hist = pd.concat(dfs_hist,axis=0,sort=True)
df_hist = df_hist.drop(['index'],axis=1)
del dfs_hist

# load marine traffic data
df_mt = pd.read_csv( mt_file , encoding='utf-8')

# getting rid of unnamed columns
if 'Unnamed: 0' in df_hist.columns:
    df_hist = df_hist.drop('Unnamed: 0', axis=1)
if 'Unnamed: 0' in df_wfr.columns:
    df_wfr = df_wfr.drop('Unnamed: 0', axis=1)

wfr_rename_map = {"IMO Number": "IMO", "LOA (m)":"loa" , "Draught (m)":"draft" ,
                   "Dwt":"dwt", "Built":"built", "Flag State":"flag" , "Builder":"builder"  ,
                   "Beam (m)":"beam",'Type':'type','Bunker Capacity IFO 380':'cap_ifo','Bunker Capacity MDO':'cap_mdo','Bunker Capacity MGO':'cap_mgo',
                  'Engine Derived Total Mechanical Propulsion (kW)':'power_kw','MMSI (Maritime Mobile Service Identity)':'mmsi_wfr','Speed (knots)':'speed_kts',
                  'GT':'gt'}
df_wfr = df_wfr.rename( columns=wfr_rename_map )

# clean supplemental
wfr_supp_rename_map = {"IMO Number": "IMO", 'MMSI (Maritime Mobile Service Identity)':'mmsi_wfr', 'Operator Country/Region': 'Operator_Country' ,
                       'Group Owner' : 'Owner_Group' , 'Group Owner Country/Region': 'Owner_Country'}
df_wfr_supp = df_wfr_supp.rename( columns=wfr_supp_rename_map )
df_wfr_supp.columns=df_wfr_supp.columns.str.replace('/','').str.replace('(','').str.replace(')','').str.replace(' ', '_')
print( len(df_wfr_supp) )
df_wfr_supp = df_wfr_supp.drop_duplicates( subset = 'mmsi_wfr' )
print( len(df_wfr_supp) )

df_wfr_supp.IMO = np.where( df_wfr_supp.IMO=="Download Limit Exceeded" , np.nan , df_wfr_supp.IMO ).astype(float)

# calculate per cylinder displacement
df_wfr_supp['EngineDispl_cul'] = 0.5 * ( df_wfr_supp.Main_Engine_1_Bore )**2 * ( df_wfr_supp.Main_Engine_1_Stroke ) / 1e6

# vars to merge from supplemental files (Owner is in original WFR data)
supp_vars = ['Owner_Group','Owner_Country','Operator','Operator_Group','Operator_Country','EngineDispl_cul']

# merge on IMO and MMSI, combine then drop dupplicates; then merge back to full characteristics df
df_merge_mmsi = df_wfr[['IMO','mmsi_wfr']].merge(df_wfr_supp.loc[df_wfr_supp.mmsi_wfr.notna(),['mmsi_wfr']+supp_vars],on="mmsi_wfr" , how="inner" )
df_merge_imo = df_wfr[['IMO','mmsi_wfr']].merge(df_wfr_supp.loc[df_wfr_supp.IMO.notna(),['IMO']+supp_vars],on="IMO" , how="inner" )

# 
df_wfr_supp_wIMO = pd.concat([df_merge_imo,df_merge_mmsi],axis=0,sort=True).drop_duplicates(subset=["IMO"])
df_wfr_supp_wIMO = df_wfr_supp_wIMO.drop(['mmsi_wfr'],axis=1)
# one vessel here with missing IMO
# df_wfr_supp_wIMO [df_wfr_supp_wIMO.IMO.isna() ]

# making historic cols same names as current wfr
hist_rename_map = {"built_y":"built", "built_loc":"builder"  ,
                   "Class":"type", "Flag" : "flag"  }
df_hist = df_hist.rename( columns=hist_rename_map )

# combine dwt columns
df_hist['dwt'] = np.where(df_hist['dwt'].isnull() , df_hist['DWT'], df_hist['dwt']) 
df_hist['dwt'] = np.where(df_hist['dwt'].isnull() , df_hist['DWT2'], df_hist['dwt'])
df_hist = df_hist.drop(['DWT','DWT2'] , axis=1)

df_hist['power_kw'] = df_hist['hp']/1.341 #0.7457 # convert horsepower to km

# combining cons_td and con_td
df_hist['cons_td'] = np.where(df_hist['cons_td'].isnull() , df_hist['con_td'], df_hist['cons_td'])
df_hist = df_hist.drop('con_td',axis=1)

df_hist['teu'] = np.where(df_hist['teu'].isnull() , df_hist['TEU'], df_hist['teu'])
df_hist = df_hist.drop('TEU',axis=1)

# drop duplicate imo numbers (not sure why there are duplicates)
# taking ones with more observed fields
#print len(df_hist)
df_hist['cnt'] = df_hist.apply(lambda x: x.count(), axis=1)
df_hist = df_hist.sort_values(['IMO','cnt'] , axis=0 , ascending=False )
df_hist = df_hist.drop_duplicates(subset=['IMO'],keep='first')
df_hist = df_hist.drop('cnt',axis=1)

#print len(df_hist)
#print len(df_wfr)
#print len(df_mt)

# merging current and historic WFR
df_wfr = df_wfr.dropna(axis=0,subset=['IMO'])
df = pd.merge( df_wfr , df_wfr_supp_wIMO , on='IMO' , how='left') # merging supplemental
df_hist = df_hist.dropna(axis=0,subset=['IMO'])
df = pd.merge(df , df_hist , on='IMO' , how='outer')
print( len(df) )

# filling flag column with 1 if from historic
df['from_hist'] = df.from_hist.fillna(1)

# concat then remove duplicate imo (keeping current wfr value if duplicates
#df = pd.concat( [df_wfr,df_hist] , axis=0 )
#df = df.dropna(axis=0,subset=['IMO Number'])

# sorting then dropping duplicate IMO
#df = df.sort_values('from_hist' , axis=0 , ascending=True )
#df = df.drop_duplicates(subset=['IMO Number'],keep='first')

# combining columns, using historic if current is na
vars = ["type","built","builder","beam","draft","dwt","flag","loa","speed_kts","power_kw",'gt']
for v in vars :
    df[v] = np.where(df[v+'_x'].isnull() , df[v+'_y'], df[v+'_x'])
    df = df.drop([v+'_x',v+'_y'],axis=1)

# dealing with odd characters
#df['Main Engine Detail'] = df['Main Engine Detail'].str.encode('ascii', 'ignore').str.decode('ascii')
#df['Main Engine Model'] = df['Main Engine Model'].str.encode('ascii', 'ignore').str.decode('ascii')
#df['Main Engine Summary'] = df['Main Engine Summary'].str.decode('ascii', 'ignore').str.encode('ascii')
#df['Name'] = df['Name'].str.decode('ascii', 'ignore').str.encode('ascii')
#df['Owner'] = df['Owner'].str.decode('ascii', 'ignore').str.encode('ascii')

#df['Main Engine Detail']  = df['Main Engine Detail'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Main Engine Model']  = df['Main Engine Model'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Main Engine Summary']  = df['Main Engine Summary'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Name']  = df['Name'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Owner']  = df['Owner'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Owner_Group']  = df['Owner_Group'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Owner_Country']  = df['Owner_Country'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Operator_Group']  = df['Operator_Group'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Operator_Country']  = df['Operator_Country'] .replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)

# cleaning types
df['type'] = df['type'].astype(str).str.replace("D/Hull " ,"")

# NOW ADD Marine Traffic Data
# can append if no duplicate IMOs, which there shouldn't be (as of 8/2019)
# need to do this before merging
df['from_MT'] = 0 ;
df_mt['from_MT'] = 1 ;
df = df.append(df_mt,ignore_index=True,sort=True)

# getting WFR classification from type to group dataframe
df_groups = df[['Group','type','register']].drop_duplicates(subset=['type'])
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='gas') , 'Group'] = "Gas Carrier"
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='multi') , 'Group'] = "Other Dry Cargo"
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='reefer') , 'Group'] = "Reefer"
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='bulkers') , 'Group'] = "Bulk Carrier"
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='roro') , 'Group'] = "Ro-Ro"
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='tankers') , 'Group'] = "Tanker"
df_groups.loc[(df_groups['Group'].isnull()) & (df_groups['register']=='chemtankers') , 'Group'] = "Tanker"

# collapsing vessel types further
df_groups['group_agg'] = np.nan
df_groups.loc[ df_groups['Group'].isin(["FCC"]) , 'group_agg'] = "FCC"
df_groups.loc[ df_groups['Group'].isin(["Bulk Carrier","Bulk Ore Carrier","Bulk/Oil Carrier"]) , 'group_agg'] = "Bulker"
df_groups.loc[ df_groups['Group'].isin(["PCC","Ro-Ro","Reefer",	"Miscel. Cargo","Other Dry Cargo"]) , 'group_agg'] = "Other Cargo"
df_groups.loc[ df_groups['Group'].isin(["Tanker","Gas Carrier","Ore/Oil Carrier"]) , 'group_agg'] = "Tankers"
df_groups.loc[ df_groups['Group'].isin(["Passenger"]) , 'group_agg'] = "Passenger"
df_groups.loc[ df_groups['Group'].isin(["Miscel.","Offshore Service","Pleasure","Fishing","Military"]) , 'group_agg'] = "Miscel"

# collapsing vessel types for IMO merge 
df_groups['Group_imo'] = df_groups['Group']
df_groups.loc[ (df_groups['register']=='chemtankers') , 'Group_imo'] = "Chemical Tanker" # add chemical tankers as explicit group 
df_groups.loc[ (df_groups['Group'].isin(['Bulk Ore Carrier'])) , 'Group_imo'] = "Bulk Carrier" # combine bulk carriers
df_groups.loc[ (df_groups['Group'].isin(['Other Dry Cargo', 'Miscel. Cargo'])) , 'Group_imo'] = "General Cargo" # combine bulk carriers
df_groups.loc[ (df_groups['Group'].isin(['Bulk/Oil Carrier','Ore/Oil Carrier'])) , 'Group_imo'] = "Tanker" # combine tankers
df_groups.loc[ (df_groups['Group'].isin(['Military'])) , 'Group_imo'] = "Miscel." # combine tankers
# reclassifying some tankers
df_groups.loc[ (df_groups['type'].isin(['Waste Disposal Carrier','Water Carrier','Fruit Juice Carrier'])) , 'Group_imo'] = "Other Tanker"
df_groups.loc[ (df_groups['type'].isin(['Chem Parcel Tanker','Methanol Carrier','Phosphoric Acid Carrier','CHEMICAL TANKER','LPG/CHEMICAL TANKER'])) , 'Group_imo'] = "Chemical Tanker"

# merging in new groups
df = pd.merge(df , df_groups[['Group','type','group_agg','Group_imo']] , on='type' , how='left')
df = df.rename(columns={"Group_x": "group_orig", "Group_y": "group"})

# create index
df['id'] = df.index

# merging in IMO aux data
df['dum1'] = 1
df_IMOaux['dum1'] = 1  

df_aux = df.merge(df_IMOaux, on=['dum1','Group_imo'])

# making capacity units column
df_aux['Capac'] = np.nan
for cap in df_aux['Capacity Units'].unique() :
    df_aux.loc[df_aux['Capacity Units']==cap,'Capac'] = df_aux[df_aux['Capacity Units']==cap][cap]

# finding those that matched
df_aux_matched = df_aux.loc[(df_aux.Capac>=df_aux.Min) & (df_aux.Capac<df_aux.Max) , ['id','aux_sea','aux_berth','aux_anchor','aux_maneuver'] ]

# look at duplicates
#dups = df_aux_matched[df_aux_matched.duplicated(subset='id',keep=False)]

# merge back to vessel dataframe
df = df.merge(df_aux_matched, on='id', how='left')
df = df.drop(['id','dum1'] , axis=1)

# cleaning up outliers
# some power obs are above 400,000 and below 100
# maximum engine power should not be above 100,000kw
power_outliers = ( df.power_kw<100 )
df['power_kw']=df.power_kw.mask(power_outliers, np.nan) 
power_outliers = ( df.power_kw>100000 )
df['power_kw']=df.power_kw.mask(power_outliers, np.nan) 

# now interpolating using iterative interp from scikit
print("Interpolating Missing Values")
interp_vars = ['beam','loa','draft','built','power_kw','speed_kts','dwt','cons_td','aux_sea']
df_interp = fill_vessel_chars.interp(df,interp_vars)

# adding some variables
df_interp['speed_kmh'] = df_interp.speed_kts * 1.852 # knots to km/h


# using Betz 2011 function to get auxilliary load
#df_interp['aux_sea_lin'] = 287.2 + 0.1913*df_interp['power_kw']

# Now generating scalars for fuel consumption (alphas)
    # alpha_power - uses SFOC from CARB (195 g/kWh IFO, 185 g/kWh MGO) - not accounting for difference now
    # alpha_cons - converts consumption in tons per day to g per kWh
    # the numerator in these terms are weight fuel/hour, need to make sure they are consistent
    # EVERYTHING HERE (numerator) metric tons of fuel per hour

df_interp['alpha_power'] = ( 195.0*df_interp.power_kw * 1e-6 ) / (df_interp.speed_kmh**3)
df_interp['alpha_cons'] = ( df_interp.cons_td/24 ) / ( df_interp.speed_kmh**3 ) 

# cleaning up any end of lines
df_interp = df_interp.replace(r'\r',' ', regex=True)
df_interp = df_interp.replace(r'\n',' ', regex=True)

print("Saving")
df_interp.to_csv(out_file_new ,index=False,encoding='utf-8')

# appending new vars to old csv and saving under new name
# updates data to include supp vars
#df_old = pd.read_csv( out_file_old , encoding='utf-8',low_memory=False)

#df_old = df_old.merge(df_interp[ ['IMO'] + supp_vars], on='IMO', how="left") 
#df_old.to_csv(out_file_new ,index=False,encoding='utf-8')
    


